# Define the list of packages required by this analysis
my_packages <- c(
  ## Data loading, cleaning, manipulation
  "tidyverse", "dplyr", "data.table", "arrow", "purrr", "visdat",
  ## Visualization
  "ggplot2", "ggrepel", "ggdist", "gghalves", "patchwork",
  "ComplexHeatmap", "circlize", "dendextend",
  ## Descriptive statistics and summaries
  "table1",
  ## Modeling and statistics
  "reshape2", "fixest", "tictoc", "lme4", "lmerTest",
  "broom.mixed", "effectsize", "modelsummary", "multcomp"
)
# Install any missing packages
installed_packages <- rownames(installed.packages())
missing_packages <- my_packages[!my_packages %in% installed_packages]
if (length(missing_packages) > 0) install.packages(missing_packages)
# Load the libraries and store the results (TRUE for success, FALSE for failure)
# vapply() (rather than sapply) guarantees a named logical vector regardless of
# input; require() is the deliberate choice here because, unlike library(), it
# returns the load status instead of erroring.
loaded <- vapply(my_packages, require, logical(1), character.only = TRUE)
cat("✅ Package loading summary:\n")
## ✅ Package loading summary:
loaded_df <- data.frame(Package = names(loaded), Loaded = loaded)
print(loaded_df)
## Package Loaded
## tidyverse tidyverse TRUE
## dplyr dplyr TRUE
## data.table data.table TRUE
## arrow arrow TRUE
## purrr purrr TRUE
## visdat visdat TRUE
## ggplot2 ggplot2 TRUE
## ggrepel ggrepel TRUE
## ggdist ggdist TRUE
## gghalves gghalves TRUE
## patchwork patchwork TRUE
## ComplexHeatmap ComplexHeatmap TRUE
## circlize circlize TRUE
## dendextend dendextend TRUE
## table1 table1 TRUE
## reshape2 reshape2 TRUE
## fixest fixest TRUE
## tictoc tictoc TRUE
## lme4 lme4 TRUE
## lmerTest lmerTest TRUE
## broom.mixed broom.mixed TRUE
## effectsize effectsize TRUE
## modelsummary modelsummary TRUE
## multcomp multcomp TRUE
This section loads the core metrics datasets (post-level and topic-level) and the corresponding raw content data (post content and topic content/prompts). It then performs an initial merge to combine the metrics with the clean content for each level.
Two levels of datasets: the post-level dataset contains individual student posts, while the topic-level dataset aggregates each discussion topic’s overall content creation and social engagement indicators.
# Load core metrics datasets: the datasets with metrics measuring content creation and social engagement.
df_topic_metrics <- read.csv("data/df_topic_0904.csv")
df_post_metrics <- read.csv("data/df_post_0904.csv")
cat("✅ Loaded post-level and topic-level metrics data (CSV).\n")
## ✅ Loaded post-level and topic-level metrics data (CSV).
# Load content data (for both topics and posts): Topic content is discussion prompts, while post content is each individual post under discussion topics.
## Topic content: only select the discussion topics for which we have topic metrics.
topic_content <- read_parquet("data/discussion_topic_assignment_19_24_cleaned_content_only.parquet.gzip")
# filter() replaces the previous piped subset() call: same result, but filter()
# is the idiomatic dplyr verb and avoids subset()'s non-standard-evaluation caveats.
topic_content <- topic_content %>%
  filter(discussion_topic_id %in% df_topic_metrics$discussion_topic_id)
## Post content: only select the discussion posts for which we have post metrics.
post_content <- read_parquet("data/discussion_post_assignment_19_24_cleaned_content_only.parquet.gzip")
post_content <- post_content %>%
  filter(discussion_post_id %in% df_post_metrics$discussion_post_id)
cat("✅ Loaded and filtered topic and post content data (Parquet).\n")
## ✅ Loaded and filtered topic and post content data (Parquet).
# Merge content data with metrics data: connect discussion prompts and posts with metrics
## df_post: Post metrics joined with clean post content.
df_post <- left_join(df_post_metrics, post_content, by = "discussion_post_id")
## df_topic: Topic metrics joined with clean discussion topic prompt content.
df_topic <- left_join(df_topic_metrics, topic_content, by = "discussion_topic_id")
cat("✅ Merging complete. Created df_post and df_topic.\n")
## ✅ Merging complete. Created df_post and df_topic.
# Calculate percentage of missing values per column in df_topic
cat("--- Missing Value Percentage in df_topic: !!! Rows with missing 'discussion_topic_content_clean' will be removed later for modeling ---\n")
## --- Missing Value Percentage in df_topic: !!! Rows with missing 'discussion_topic_content_clean' will be removed later for modeling ---
# Define the common text representations of missing values you want to exclude
# NOTE: this vector is reused later when filtering df_combined, so keep the name stable.
missing_content_placeholders <- c("", "NA", "N/A", "NULL", "No message")
# Enhanced check for all forms of missingness in the key content column
# Counts rows that are either true NA or one of the placeholder strings above;
# %in% never returns NA, so the logical OR is safe to sum directly.
total_missing_count <- sum(
is.na(df_topic$discussion_topic_content_clean) |
df_topic$discussion_topic_content_clean %in% missing_content_placeholders
)
total_missing_percentage <- (total_missing_count / nrow(df_topic)) * 100
cat("--- Enhanced Missing Value Percentage in df_topic ---\n")
## --- Enhanced Missing Value Percentage in df_topic ---
cat("Total rows (topics):", nrow(df_topic), "\n")
## Total rows (topics): 7667
cat(sprintf("Total missing/empty/placeholder content rows (Count): %d\n", total_missing_count))
## Total missing/empty/placeholder content rows (Count): 432
cat(sprintf("Total missing/empty/placeholder content rows (Percentage): %.2f%%\n", total_missing_percentage))
## Total missing/empty/placeholder content rows (Percentage): 5.63%
# Count the number of unique values (check duplicates) for each variable in df_topic
cat("--- Unique Value Count in df_topic ---\n")
## --- Unique Value Count in df_topic ---
# One distinct-value count per column, printed as a one-row frame.
df_topic_unique_counts <- df_topic %>%
summarise(across(everything(), n_distinct))
print(df_topic_unique_counts)
## X discussion_topic_id avg_similarity embedding_variance semantic_spread
## 1 7667 7667 7662 7663 353
## MRS_depth thread_MRS_avg_depth post_count max_depth avg_depth max_width
## 1 6392 1900 333 5 1247 36
## avg_width total_users replied_users percent_replied avg_top_post_length
## 1 1245 214 118 958 5165
## discussion_topic_title discussion_topic_content_clean
## 1 4810 909
cat("Note: With 7667 topic IDs, having 909 unique content entries confirms many topics reuse the same discussion prompt.\n")
## Note: With 7667 topic IDs, having 909 unique content entries confirms many topics reuse the same discussion prompt.
# ---Inspecting df_post: get a sense of variables in df_post that might be useful--
# Calculate percentage of missing values per column in df_post
cat("--- Missing Value Percentage in df_post ---\n")
## --- Missing Value Percentage in df_post ---
# Share of NA values per column; only columns with at least one NA are printed.
na_percentage_post <- colSums(is.na(df_post)) / nrow(df_post) * 100
print(na_percentage_post[na_percentage_post > 0])
## parent_discussion_post_id user_group_id
## 54.8608849 49.8264595
## first_generation race
## 7.2504665 3.8248924
## gender term_code
## 0.3381953 3.4131880
## course_title school_code
## 3.4131880 100.0000000
## school_name_abbrev dept_code
## 100.0000000 100.0000000
## dept_name_abbrev course_level
## 3.4131880 100.0000000
## course_type online_course
## 3.4131880 100.0000000
## uptake parent_discussion_post_created_at
## 56.0076794 56.0752646
## response_time_gap
## 56.0752646
cat("Note: 1. These variables have no data: school_code, school_name_abbrev, dept_code, course_level, online_course. 2. The variables(50–57% na_values) primarily related to post interaction/response have missing values structurally induced by the post_type: parent_discussion_post_id, uptake, parent_discussion_post_created_at, response_time_gap, user_group_id. 3. Variable with low missingness (< 8%): first_generation, race, dept_name_abbrev, term_code, course_title, course_type, gender.\n")
## Note: 1. These variables have no data: school_code, school_name_abbrev, dept_code, course_level, online_course. 2. The variables(50–57% na_values) primarily related to post interaction/response have missing values structurally induced by the post_type: parent_discussion_post_id, uptake, parent_discussion_post_created_at, response_time_gap, user_group_id. 3. Variable with low missingness (< 8%): first_generation, race, dept_name_abbrev, term_code, course_title, course_type, gender.
cat("---!!! Need to handle missing values in course_type and dept_name_abbrev---\n")
## ---!!! Need to handle missing values in course_type and dept_name_abbrev---
# Count the number of unique values for each variable in df_post
cat("\n--- Unique Value Count in df_post ---\n")
##
## --- Unique Value Count in df_post ---
df_post_unique_counts <- df_post %>%
summarise(across(everything(), n_distinct))
print(df_post_unique_counts)
## X.1 X mellon_id canvas_course_id discussion_post_id
## 1 371383 371383 26474 746 371383
## parent_discussion_post_id discussion_post_created_at
## 1 91914 368113
## discussion_topic_posted_at discussion_topic_id user_group_id gpa_term
## 1 3389 7667 1081 333
## Analytic Clout Tone Authentic int_student citizenship_app first_generation
## 1 8560 8964 7119 8360 2 4 3
## low_income first_language race gender term class_size term_code course_title
## 1 3 31 6 3 8 180 9 196
## school_code school_name_abbrev dept_code dept_name_abbrev course_level
## 1 1 1 1 53 1
## course_type online_course informativeness politeness uptake formality
## 1 10 1 53674 460 153498 366046
## word_cnt parent_discussion_post_created_at post_time_gap response_time_gap
## 1 1444 89147 351511 162193
## first_post_created_at post_type GPT discussion_post_content_clean
## 1 7665 2 2 370881
RQ1: At the discussion topic level, how do students’ content creation and social engagement differ between cases where GPT use = 0 (no GenAI) and GPT use = 1 (GenAI used)? Build formula with course-level controls + fixed effects for the full dataset: formula <- as.formula(paste0(metric, " ~ GPT_use + discussion_size_s + pct_international_s + pct_posts_s + course_type | discussion_topic_content_clean")) RQ2: How do these impacts vary across different majors? Same formula as RQ1, but it will run through each field’s subset. RQ3: How do these impacts vary across different discussion prompt types? Same formula as RQ1, but it will run through each discussion prompt type’s subset.
—Summary of key variables for next steps — Core Identifiers and Key Variables - ID: mellon_id (Student ID), discussion_topic_id (Grouping ID for aggregation: used to merge post-level and topic-level data) - Independent variable: GPT (0=No GPT time, 1=GPT time) - Post Metrics: Analytic, informativeness, word_cnt (These will be averaged per topic)
Control Variables - discussion_size: Number of unique students (mellon_id) within each discussion_topic_id → Calculate using df_post %>% group_by(discussion_topic_id) %>% summarise(discussion_size = n_distinct(mellon_id)) - course_type: Type of course (e.g., lecture, seminar) - int_student: Percentage of international students within each discussion topic → Calculate as mean(int_student == 1) per discussion_topic_id - post_type_ratio: Proportion of original posts relative to total posts (posts + replies) → pct_posts = number_of_posts / (number_of_posts + number_of_replies)
RQ2 dept_name_abbrev: will be used to subset fields
—All variables (for reference only) — - Content: discussion_post_content_clean - Course context variables: dept_name_abbrev, class_size, course_type, term, course_title - Identifiers: canvas_course_id, discussion_post_id, parent_discussion_post_id, user_group_id - Time-based variables: discussion_post_created_at, discussion_topic_posted_at, parent_discussion_post_created_at - Academic: gpa_term - Demographic Variables: int_student, citizenship_app, first_generation, low_income, first_language, race, gender. - Interaction: post_type, post_time_gap, response_time_gap, uptake, first_post_created_at - Content tone: Clout, Authentic, Tone, politeness, formality
# Create topic-level aggregated variables from post-level data
# One row per discussion_topic_id; these become controls/outcomes for topic-level models.
topic_aggregated <- df_post %>%
group_by(discussion_topic_id) %>%
summarise(
# Student and discussion characteristics
discussion_size = n_distinct(mellon_id),
# Assumes int_student is coded 0/1 so the mean is a proportion -- TODO confirm coding
pct_international = mean(int_student, na.rm = TRUE) * 100,
# Percentage of Original Posts: The count of "Post" type contributions divided by the total count of "Post" and "Reply" types.
pct_posts = 100 * sum(post_type == "Post", na.rm = TRUE) /
(sum(post_type == "Post", na.rm = TRUE) + sum(post_type == "Reply", na.rm = TRUE)),
# Content measures - averages per topic: Averaged measures of content length, depth, and width.
avg_word_count = mean(word_cnt, na.rm = TRUE),
avg_analytic = mean(Analytic, na.rm = TRUE),
avg_informativeness = mean(informativeness, na.rm = TRUE),
# GPT usage indicators
# A topic counts as GPT-using (GPT_use = 1) if ANY post within it has GPT == 1.
GPT_use = as.numeric(any(GPT == 1, na.rm = TRUE)),
# Course characteristics (taking first value since should be constant within topic).
## While course_type and dept_name_abbrev have missing values, we will handle that later in df_combined just in case some miss values for certain rows can be found in others with the same topic_id.
#class_size = first(class_size), # this is not necessary
# NOTE(review): first() can return NA even when later rows within the same topic
# carry a value -- confirm this is acceptable before relying on these columns.
course_type = first(course_type),
dept_name_abbrev = first(dept_name_abbrev),
.groups = 'drop' # Important: Removes the grouping structure after summarization
)
cat("✅ Successfully aggregated post-level data into 'topic_aggregated' dataframe.\n")
## ✅ Successfully aggregated post-level data into 'topic_aggregated' dataframe.
# Merge aggregated post-level metrics with topic-level data
# left_join keeps every aggregated topic; df_topic columns are appended by ID.
df_combined <- left_join(topic_aggregated, df_topic, by = "discussion_topic_id")
cat("✅ Merged 'topic_aggregated' with 'df_topic'. The final analytical dataset is 'df_combined'.\n")
## ✅ Merged 'topic_aggregated' with 'df_topic'. The final analytical dataset is 'df_combined'.
cat(paste("Final Combined Dataset ('df_combined') has", nrow(df_combined), "rows (topics).\n"))
## Final Combined Dataset ('df_combined') has 7667 rows (topics).
# Outcome metric name vectors used throughout the rest of the analysis.
# Content creation indicators: semantic-diversity measures plus averaged
# post-content measures.
content_metrics <- c(
  "avg_similarity",
  "embedding_variance",
  "semantic_spread",
  "MRS_depth",
  "avg_word_count",
  "avg_analytic",
  "avg_informativeness"
)
# Social engagement indicators: reply coverage and thread depth/width.
social_metrics <- c(
  "percent_replied",
  "avg_depth",
  "max_depth",
  "avg_width",
  "max_width"
)
# Master list: content metrics first, then social metrics (order matters for plots).
all_metrics <- c(content_metrics, social_metrics)
cat("✅ Content and Social Metrics lists defined.\n")
## ✅ Content and Social Metrics lists defined.
# Build one histogram per metric so all distributions can be inspected side by side.
plot_list <- map(all_metrics, function(metric_name) {
  ggplot(df_combined, aes(x = .data[[metric_name]])) +
    geom_histogram(bins = 30, fill = "#0072B2", color = "white", alpha = 0.8) +
    labs(
      title = paste("Distribution of", metric_name),
      x = metric_name,
      y = "Count"
    ) +
    theme_minimal(base_size = 10) +
    theme(
      plot.title = element_text(size = 5, face = "bold"),
      axis.title.x = element_text(size = 6),
      axis.title.y = element_text(size = 6),
      axis.text.x = element_text(angle = 45, hjust = 1, size = 6),
      axis.text.y = element_text(size = 6)
    )
})
# Arrange the 12 histograms into a compact 3-row by 4-column grid via patchwork.
combined_plot <- wrap_plots(plot_list, ncol = 4, nrow = 3)
# Render the assembled grid.
print(combined_plot)
cat("--- Summary Statistics for Key Metrics in df_combined ---\n")
## --- Summary Statistics for Key Metrics in df_combined ---
# Use 'all_of()' inside select to explicitly handle character vector of column names
# Use unclass() or as.data.frame() before print() to ensure the 'n' argument works
# Pivoting to long format yields one summary row per metric from a single summarise().
df_combined_summary <- df_combined %>%
dplyr::select(all_of(all_metrics)) %>%
pivot_longer(cols = everything(), names_to = "metric", values_to = "value") %>%
group_by(metric) %>%
summarise(
Mean = mean(value, na.rm = TRUE),
SD = sd(value, na.rm = TRUE),
Min = min(value, na.rm = TRUE),
Max = max(value, na.rm = TRUE),
Q99 = quantile(value, 0.99, na.rm = TRUE), # Check 99th percentile for outliers
.groups = 'drop'
)
# Convert to a standard data frame and print the result
# (n = 12 ensures all 12 metric rows are shown in one print)
print(df_combined_summary, n = 12)
## # A tibble: 12 × 6
## metric Mean SD Min Max Q99
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 MRS_depth 0.266 5.43e-2 0.168 0.484 0.393
## 2 avg_analytic 68.2 1.41e+1 13.4 98.4 94.5
## 3 avg_depth 1.39 2.54e-1 1 2.02 1.79
## 4 avg_informativeness 0.886 4.39e-2 0.701 1 1
## 5 avg_similarity 0.525 1.46e-1 0.0876 0.966 0.806
## 6 avg_width 0.383 2.50e-1 0 0.9 0.754
## 7 avg_word_count 178. 1.36e+2 11.1 2251. 763.
## 8 embedding_variance 0.00119 3.79e-4 0.0000824 0.00219 0.00199
## 9 max_depth 1.94 6.07e-1 1 5 3
## 10 max_width 3.11 3.20e+0 0 45 15
## 11 percent_replied 0.471 3.27e-1 0 1 0.938
## 12 semantic_spread 1.00 7.59e-8 1.000 1.00 1.00
# # Apply CONSTANT MANUAL SCALING to specific metrics
# df_combined <- df_combined %>%
# mutate(
# # 1. Scale avg_word_count: Divide by 1000 (New unit: thousands of words)
# avg_word_count = avg_word_count / 1000,
# # 2. Scale avg_analytic: Divide by 100 (New unit: hundredths of LIWC score)
# # The max raw value is ~98, so the max scaled value will be ~0.98.
# avg_analytic = avg_analytic / 100,
# # 3. Scale max_width: Divide by 10 (New unit: tenths of post count)
# # The max raw value is 45, so the max scaled value will be 4.5.
# # max_width = max_width / 10
# )
# Apply z-score standardization to all metrics
# scale() returns a one-column matrix; [,1] extracts it back to a plain numeric vector.
df_combined <- df_combined %>%
mutate(across(all_of(all_metrics), ~scale(.)[,1]))
# Check missing values in the intermediate combined dataset
cat("--- Missing Value Count in df_combined (Before Cleaning) ---\n")
## --- Missing Value Count in df_combined (Before Cleaning) ---
na_df_combined <- colSums(is.na(df_combined))
print(na_df_combined[na_df_combined > 0])
## course_type dept_name_abbrev
## 264 264
## discussion_topic_content_clean
## 392
cat("Note: Proceeding to clean missing values.\n")
## Note: Proceeding to clean missing values.
# Handle missing values
df_analysis <- df_combined %>%
  # remove rows where discussion_topic_content_clean is NA
  filter(
    # Exclude standard NA values
    !is.na(discussion_topic_content_clean) &
      # Exclude text placeholders/empty strings
      !(discussion_topic_content_clean %in% missing_content_placeholders)
  ) %>%
  mutate(
    # coalesce() is the idiomatic dplyr form of ifelse(is.na(x), "Unknown", x)
    # for character columns; behavior is identical.
    dept_name_abbrev = coalesce(dept_name_abbrev, "Unknown"),
    course_type = coalesce(course_type, "Unknown")
  )
cat("✅ Rows remaining after filtering NA topic content:", nrow(df_analysis), "\n")
## ✅ Rows remaining after filtering NA topic content: 7235
cat("--- Missing Value Count in df_analysis (After Cleaning) ---\n")
## --- Missing Value Count in df_analysis (After Cleaning) ---
colSums_na_df_analysis <- colSums(is.na(df_analysis))
print(colSums_na_df_analysis[colSums_na_df_analysis > 0])
## named numeric(0)
# Should show no NAs for key variables
# Convert categorical variables to factors and Standardize numeric variables (z-score) for modeling or comparison
df_analysis_transformed <- df_analysis %>%
  mutate(
    # Factorize categorical variables
    course_type = as.factor(course_type),
    discussion_topic_content_clean = as.factor(discussion_topic_content_clean),
    # Standardize continuous control variables (Z-score);
    # scale() returns a one-column matrix, so [,1] extracts the vector.
    discussion_size = scale(discussion_size)[,1],
    pct_international = scale(pct_international)[,1],
    pct_posts = scale(pct_posts)[,1]
  )
head(df_analysis_transformed)
## # A tibble: 6 × 27
## discussion_topic_id discussion_size pct_international pct_posts avg_word_count
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 593148 0.844 0.221 1.64 -0.325
## 2 593152 3.54 -0.243 1.59 -0.283
## 3 593154 3.34 -0.236 1.62 0.0721
## 4 593155 0.844 0.183 1.51 -0.209
## 5 596691 -0.268 0.169 1.64 0.0627
## 6 596692 -0.323 0.299 1.64 0.0544
## # ℹ 22 more variables: avg_analytic <dbl>, avg_informativeness <dbl>,
## # GPT_use <dbl>, course_type <fct>, dept_name_abbrev <chr>, X <int>,
## # avg_similarity <dbl>, embedding_variance <dbl>, semantic_spread <dbl>,
## # MRS_depth <dbl>, thread_MRS_avg_depth <dbl>, post_count <int>,
## # max_depth <dbl>, avg_depth <dbl>, max_width <dbl>, avg_width <dbl>,
## # total_users <int>, replied_users <int>, percent_replied <dbl>,
## # avg_top_post_length <dbl>, discussion_topic_title <chr>, …
# Fixed: these messages previously referred to 'df_analysis_field_transformed',
# but the object created above is named 'df_analysis_transformed'.
cat("✅ Created 'df_analysis_transformed': Categorical variables factorized and controls standardized.\n")
## ✅ Created 'df_analysis_transformed': Categorical variables factorized and controls standardized.
cat(paste("Final Analytical Dataset ('df_analysis_transformed') has", nrow(df_analysis_transformed), "rows.\n"))
## Final Analytical Dataset ('df_analysis_transformed') has 7235 rows.
# Only factorize variables
# (kept separate from df_analysis_transformed so raw, unstandardized control
# values remain available; note head() below shows raw discussion_size etc.)
df_analysis_factorized <- df_analysis %>%
  mutate(
    course_type = as.factor(course_type),
    # Fixed: removed the trailing comma that previously followed this argument —
    # it only worked by accident of rlang's lenient dots handling.
    discussion_topic_content_clean = as.factor(discussion_topic_content_clean)
  )
head(df_analysis_factorized)
## # A tibble: 6 × 27
## discussion_topic_id discussion_size pct_international pct_posts avg_word_count
## <int> <int> <dbl> <dbl> <dbl>
## 1 593148 59 22.0 100 -0.325
## 2 593152 156 13.2 98.7 -0.283
## 3 593154 149 13.3 99.3 0.0721
## 4 593155 59 21.3 96.7 -0.209
## 5 596691 19 21.1 100 0.0627
## 6 596692 17 23.5 100 0.0544
## # ℹ 22 more variables: avg_analytic <dbl>, avg_informativeness <dbl>,
## # GPT_use <dbl>, course_type <fct>, dept_name_abbrev <chr>, X <int>,
## # avg_similarity <dbl>, embedding_variance <dbl>, semantic_spread <dbl>,
## # MRS_depth <dbl>, thread_MRS_avg_depth <dbl>, post_count <int>,
## # max_depth <dbl>, avg_depth <dbl>, max_width <dbl>, avg_width <dbl>,
## # total_users <int>, replied_users <int>, percent_replied <dbl>,
## # avg_top_post_length <dbl>, discussion_topic_title <chr>, …
This step creates the field variable needed to subset the data for RQ2.
# Inspect Unique Departments
# ("Unknown" appears because NA dept_name_abbrev values were replaced during cleaning)
unique_dept <- unique(df_analysis$dept_name_abbrev)
cat("departments:", unique_dept, "\n")
## departments: COMLIT SOCSCI FRENCH Unknown PHRMSCI MGMT WRITING ACENG MUSIC PUBHLTH LSCI CRM/LAW ECON PHILOS ENGLISH PSCI EDUC HISTORY DRAMA ASIANAM UNIAFF NURSCI COMPSCI GERMAN PHYSICS SOCIOL FLM&MDA BIOSCI ART ENGR ARABIC DANCE POLSCI SOCECOL I&CSCI UNISTU EAS EARTHSS SPANISH UPPP ENGRMAE IN4MATX HUMAN CHC/LAT PSYCH INTLST SPPS MATH
# Create 'field' Variables: Language, Art, Humanities, Social Science, Theoretical STEM, Applied STEM, Others
# Map each department abbreviation onto a broad academic field and store the
# result directly as a factor, all in a single pipeline.
df_analysis_field <- df_analysis_transformed %>%
  mutate(
    field = factor(
      case_when(
        dept_name_abbrev %in% c("FRENCH", "ENGLISH", "GERMAN", "ARABIC", "SPANISH", "ACENG") ~ "Language",
        dept_name_abbrev %in% c("MUSIC", "ART", "DRAMA", "DANCE") ~ "Arts",
        dept_name_abbrev %in% c("COMLIT", "FLM&MDA", "WRITING", "PHILOS", "HISTORY",
                                "ASIANAM", "EAS", "HUMAN") ~ "Humanities",
        dept_name_abbrev %in% c("CHC/LAT", "SOCSCI", "CRM/LAW", "ANTHRO", "ECON", "PSCI",
                                "SOCIOL", "POLSCI", "SOCECOL", "UPPP", "INTLST", "LSCI",
                                "PSYCH", "SPPS", "MGMT", "EDUC") ~ "Social Sciences",
        dept_name_abbrev %in% c("PHYSICS", "EARTHSS", "MATH", "BIOSCI") ~ "Theoretical STEM",
        dept_name_abbrev %in% c("COMPSCI", "I&CSCI", "IN4MATX", "ENGR", "ENGRMAE",
                                "PHRMSCI", "PUBHLTH", "NURSCI") ~ "Applied STEM",
        dept_name_abbrev %in% c("UNIAFF", "UNISTU") ~ "Others",
        TRUE ~ "Unknown" # catch-all for missing or uncategorized departments
      )
    )
  )
cat("✅ Created field classification variable 'field'.\n")
## ✅ Created field classification variable 'field'.
# Split the data frame dept_name_abbrev into a list of data frames.
# split() creates one data frame per level of the 'field' factor.
field_subsets_all <- split(df_analysis_field, df_analysis_field$field)
# Will drop Unknown and Others field subset from this list
field_subsets <- field_subsets_all[names(field_subsets_all) != "Unknown" & names(field_subsets_all) != "Others"]
# To view the names of all the created subsets:
cat("Subset Names (for iteration in regression):", names(field_subsets),"\n")
## Subset Names (for iteration in regression): Applied STEM Arts Humanities Language Social Sciences Theoretical STEM
# Confirming the names of the subsets created
cat("✅ Data successfully split into", length(field_subsets), "subsets.\n")
## ✅ Data successfully split into 6 subsets.
# Check Subset Sizes: Calculate the number of topics (rows) in each field subset.
subset_sizes <- sapply(field_subsets, nrow)
# Print the table of subset sizes for review.
cat("Field Subset Sizes (Number of Discussion Topics)\n")
## Field Subset Sizes (Number of Discussion Topics)
print(subset_sizes)
## Applied STEM Arts Humanities Language
## 2056 133 1061 203
## Social Sciences Theoretical STEM
## 3414 73
# Assign individual subsets to new variables (#Language, Art, Humanities, Social Science, Theoretical STEM, Applied STEM, Others)
# list2env(field_subsets, envir = .GlobalEnv)
# language_df <- field_subsets$`Language`
# art_df <- field_subsets$`Art`
# humanities_df <- field_subsets$`Humanities`
# social_sciences_df <- field_subsets$`Social Sciences`
# theoretical_stem_df <- field_subsets$`Theoretical STEM`
# applied_stem_df <- field_subsets$`Applied STEM`
# others_df <- field_subsets$`Others`
# head(language_df)
—For RQ 3, we classified discussion prompts into different types using multiple frameworks. 1) knowledge types: “Metacognitive”, “Procedural”, “Conceptual”, “Factual”; 2) cognitive types: “Create”, “Apply”, “Evaluate”, “Analyze”, “Remember”, “Understand”; 3) purpose types: “academic”, “social”, “logistics”; 4) academic types: “knowledge_reproduction”, “open_qa”, “synthesis_application”, “personal_connection”, “collaborative_interaction” —
# Load data with knowledge and cognitive types
df_knowledge_cognitive <- read.csv("data/knowledge_cognitive.csv")
cat("✅ Data successfully loaded! Columns available:\n")
## ✅ Data successfully loaded! Columns available:
print(names(df_knowledge_cognitive))
## [1] "Unnamed..0" "discussion_topic_id"
## [3] "discussion_topic_title" "discussion_topic_content_clean"
## [5] "Pred_Knowledge_Type" "Pred_Cognitive_Process"
cat("columns discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process will be added to main dataset\n")
## columns discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process will be added to main dataset
# Display unique values in the "Pred_Knowledge_Type" column
# (the "" entry is a placeholder-style missing value; it is handled in the next step)
unique(df_knowledge_cognitive$Pred_Knowledge_Type)
## [1] "Metacognitive" "Procedural" "Conceptual" "Factual"
## [5] ""
# Display unique values in the "Pred_Cognitive_Process" column
unique(df_knowledge_cognitive$Pred_Cognitive_Process)
## [1] "Create" "Apply" "Evaluate" "Analyze" "Remember"
## [6] "Understand" ""
# Given we see "" from unique value check, Check overall missingness before cleaning
# Define the common text representations of missing values
missing_strings <- c("", "NA", "N/A", "NULL", "Unknown", "No message")
# Enhanced check covering NA, empty strings, and common missing text
cat("=== Enhanced Missing Value Summary (All Forms) ===\n")
## === Enhanced Missing Value Summary (All Forms) ===
# vapply() replaces sapply() so the result is guaranteed to be an integer vector
# (sum() over a logical mask returns integer); printed output is unchanged.
enhanced_missing_summary <- vapply(
  df_knowledge_cognitive[, c("Pred_Knowledge_Type", "Pred_Cognitive_Process", "discussion_topic_content_clean")],
  function(x) sum(is.na(x) | x %in% missing_strings),
  integer(1)
)
print(enhanced_missing_summary)
## Pred_Knowledge_Type Pred_Cognitive_Process
## 432 432
## discussion_topic_content_clean
## 432
cat("=== Clean data and Validation ===\n")
## === Clean data and Validation ===
# Clean the dataframe: Remove rows with missing_strings.
df_knowledge_cognitive_clean <- df_knowledge_cognitive %>%
  filter(
    # Check for non-missing/non-placeholder in Pred_Knowledge_Type
    !(Pred_Knowledge_Type %in% missing_strings))
# Validation: Check for remaining missing values in the cleaned subset
# Fixed: dropped the redundant `x == ""` test ("" is already in missing_strings
# and %in% never returns NA), and switched to vapply() for a type-stable result.
enhanced_missing_summary_clean <- vapply(
  df_knowledge_cognitive_clean[, c("Pred_Knowledge_Type", "Pred_Cognitive_Process", "discussion_topic_content_clean")],
  function(x) sum(is.na(x) | x %in% missing_strings),
  integer(1)
)
# Should show 0 for the key columns
print(enhanced_missing_summary_clean)
## Pred_Knowledge_Type Pred_Cognitive_Process
## 0 0
## discussion_topic_content_clean
## 0
cat("=== Validation: No missing values confirmed ===\n")
## === Validation: No missing values confirmed ===
# Select only the necessary columns from df_knowledge_cognitive_clean data
df_prompt_types_to_merge <- df_knowledge_cognitive_clean %>%
dplyr::select(discussion_topic_id, Pred_Knowledge_Type, Pred_Cognitive_Process)
# Left join these columns to the main analytical dataframe
# (left join keeps all 7235 analysis rows; unmatched topics get NA prompt types)
df_analysis_prompt_types <- left_join(
df_analysis_field,
df_prompt_types_to_merge,
by = "discussion_topic_id"
)
# Factorize the new categorical columns for consistency
df_analysis_prompt_types <- df_analysis_prompt_types %>%
mutate(
Pred_Knowledge_Type = as.factor(Pred_Knowledge_Type),
Pred_Cognitive_Process = as.factor(Pred_Cognitive_Process)
)
cat("✅ Merged 'Pred_Knowledge_Type' and 'Pred_Cognitive_Process' into 'df_analysis_prompt_types'.\n")
## ✅ Merged 'Pred_Knowledge_Type' and 'Pred_Cognitive_Process' into 'df_analysis_prompt_types'.
cat(paste("New dataset has", nrow(df_analysis_prompt_types), "rows.\n"))
## New dataset has 7235 rows.
# Define the levels based on the research question
# NOTE(review): these two level vectors are not used by the split() calls below
# (split() uses each factor's own levels); presumably kept for later releveling -- confirm.
knowledge_levels <- c("Metacognitive", "Procedural", "Conceptual", "Factual")
cognitive_levels <- c("Create", "Apply", "Evaluate", "Analyze", "Remember", "Understand")
# --- A. Create Knowledge Subsets ---
# Split the filtered data frame into a list of data frames based on Pred_Knowledge_Type
knowledge_subsets <- split(df_analysis_prompt_types, df_analysis_prompt_types$Pred_Knowledge_Type)
# --- B. Create Cognitive Subsets ---
# Split the filtered data frame into a list of data frames based on Pred_Cognitive_Process
cognitive_subsets <- split(df_analysis_prompt_types, df_analysis_prompt_types$Pred_Cognitive_Process)
# --- Validation: Check Subset Sizes ---
cat("Knowledge Subset Sizes (Number of Discussion Topics)\n")
## Knowledge Subset Sizes (Number of Discussion Topics)
knowledge_subset_sizes <- sapply(knowledge_subsets, nrow)
print(knowledge_subset_sizes)
## Conceptual Factual Metacognitive Procedural
## 3803 335 2874 223
cat("Cognitive Subset Sizes (Number of Discussion Topics)\n")
## Cognitive Subset Sizes (Number of Discussion Topics)
cognitive_subset_sizes <- sapply(cognitive_subsets, nrow)
print(cognitive_subset_sizes)
## Analyze Apply Create Evaluate Remember Understand
## 1160 214 1579 3482 74 726
cat("\n✅ Created 'knowledge_subsets' (4 subsets) and 'cognitive_subsets' (6 subsets).\n")
##
## ✅ Created 'knowledge_subsets' (4 subsets) and 'cognitive_subsets' (6 subsets).
# Load and Process Purpose-Type Data
df_purpose_type <- read.csv("data/purpose_type.csv")
cat("✅ Purpose-type data loaded successfully! Columns available:\n")
## ✅ Purpose-type data loaded successfully! Columns available:
print(names(df_purpose_type))
## [1] "discussion_topic_id" "QUESTION" "Bloom_Level"
## [4] "cleaned_text" "academic" "social"
## [7] "logistics"
# Enhanced check covering NA, empty strings, and common missing text
# (reuses the 'missing_strings' vector defined in the knowledge/cognitive cleaning step)
cat("=== Enhanced Missing Value Summary (All Forms) ===\n")
## === Enhanced Missing Value Summary (All Forms) ===
enhanced_missing_summary1 <- sapply(
df_purpose_type[, c("academic", "social", "logistics", "cleaned_text")],
function(x) sum(is.na(x) | x %in% missing_strings)
)
print(enhanced_missing_summary1)
## academic social logistics cleaned_text
## 0 0 0 432
# df_analysis_field has 7235 observations, and df_purpose_type has 7667 observations. Both data frames have discussion_topic_id, which can be used for the merge. I want to 1) add three columns ("academic", "social", "logistics") from df_purpose_type to df_analysis_field; 2) name the updated dataframe as df_analysis_purposetype; 3) check the size of df_analysis_purposetype; 4) Create subsets for academic, social, and logistics (value == 1) to ensure things like the academic_subset only has rows with academic == 1; 5) check the size of each subset.
# 0. Select only the ID and the three purpose type columns from df_purpose_type
df_purpose_cols <- df_purpose_type %>%
dplyr::select(discussion_topic_id, academic, social, logistics)
# 1. Add three columns from df_purpose_cols to df_analysis_field using a left join
# (left join keeps only the 7235 topics already in df_analysis_field)
df_analysis_purposetype <- left_join(df_analysis_field, df_purpose_cols, by = "discussion_topic_id")
# 2. Check the size of df_analysis_purposetype
cat("✅ Merged dataset 'df_analysis_purposetype' created.\n")
## ✅ Merged dataset 'df_analysis_purposetype' created.
cat("3. Size of df_analysis_purposetype:", nrow(df_analysis_purposetype), "rows.\n")
## 3. Size of df_analysis_purposetype: 7235 rows.
cat("Note: The number of rows remains 7235, matching df_analysis_field.\n")
## Note: The number of rows remains 7235, matching df_analysis_field.
# 4. Create subsets for academic, social, and logistics (value == 1)
# (purpose flags are not mutually exclusive, so subset sizes can sum past 7235)
academic_subsets <- df_analysis_purposetype %>% filter(academic == 1)
social_subsets <- df_analysis_purposetype %>% filter(social == 1)
logistics_subsets <- df_analysis_purposetype %>% filter(logistics == 1)
# 5. Check the size of each subset
cat("✅ Purpose-type subsets created successfully.\n")
## ✅ Purpose-type subsets created successfully.
cat("Academic subset rows: ", nrow(academic_subsets), "\n")
## Academic subset rows: 5938
cat("Social subset rows: ", nrow(social_subsets), "\n")
## Social subset rows: 875
cat("Logistics subset rows:", nrow(logistics_subsets), "\n")
## Logistics subset rows: 1159
cat("Verification: Checking if 'academic_subset' only contains rows where academic == 1:\n")
## Verification: Checking if 'academic_subset' only contains rows where academic == 1:
unique(academic_subsets$academic)
## [1] 1
unique(social_subsets$social)
## [1] 1
unique(logistics_subsets$logistics)
## [1] 1
cat("Note: confirmed\n")
## Note: confirmed
cat("\n🎉 All subsets for Purpose Types created and verified successfully!\n")
##
## 🎉 All subsets for Purpose Types created and verified successfully!
# Load and Process Academic-Type Data
df_academic_type <- read.csv("data/academic_type.csv")
# Rename columns: drop the long "annotator_openai_" prefix for cleaner names
df_academic_type <- df_academic_type %>%
  rename(
    knowledge_reproduction = annotator_openai_knowledge_reproduction,
    open_qa = annotator_openai_open_qa,
    synthesis_application = annotator_openai_synthesis_application,
    personal_connection = annotator_openai_personal_connection,
    collaborative_interaction = annotator_openai_collaborative_interaction
  )
cat("✅ Column names renamed successfully:\n")
## ✅ Column names renamed successfully:
print(names(df_academic_type))
## [1] "Unnamed..0" "discussion_topic_id"
## [3] "discussion_topic_title" "discussion_topic_content_clean"
## [5] "QUESTION" "Bloom_Level"
## [7] "cleaned_text" "academic"
## [9] "social" "logistics"
## [11] "openai_raw_output" "knowledge_reproduction"
## [13] "open_qa" "synthesis_application"
## [15] "personal_connection" "collaborative_interaction"
# Check for missing values in the five academic-type flag columns.
# vapply() instead of sapply(): guarantees one integer per column, so the
# output type cannot silently change on odd input.
cat("=== Missing Value Summary (Academic-Type Data) ===\n")
## === Missing Value Summary (Academic-Type Data) ===
missing_summary_academic <- vapply(
  df_academic_type[, c(
    "knowledge_reproduction",
    "open_qa",
    "synthesis_application",
    "personal_connection",
    "collaborative_interaction"
  )],
  function(x) sum(is.na(x)),
  FUN.VALUE = integer(1)
)
print(missing_summary_academic)
## knowledge_reproduction open_qa synthesis_application
## 0 0 0
## personal_connection collaborative_interaction
## 0 0
# === Merge df_academic_type with df_analysis_field ===
# 0. Keep only the merge key plus the five academic-type flags
df_academic_cols <- dplyr::select(
  df_academic_type,
  discussion_topic_id,
  knowledge_reproduction,
  open_qa,
  synthesis_application,
  personal_connection,
  collaborative_interaction
)
# 1. Left join: every row of df_analysis_field is retained
df_analysis_academictype <- df_analysis_field %>%
  left_join(df_academic_cols, by = "discussion_topic_id")
# 2. Report the size of the merged dataset
cat("✅ Merged dataset 'df_analysis_academictype' created.\n")
## ✅ Merged dataset 'df_analysis_academictype' created.
cat("3. Size of df_analysis_academictype:", nrow(df_analysis_academictype), "rows.\n")
## 3. Size of df_analysis_academictype: 7235 rows.
cat("Note: The number of rows remains 7235, matching df_analysis_field.\n")
## Note: The number of rows remains 7235, matching df_analysis_field.
# 4. One subset per academic type (rows where the flag == 1)
knowledge_reproduction_subsets <- dplyr::filter(df_analysis_academictype, knowledge_reproduction == 1)
open_qa_subsets <- dplyr::filter(df_analysis_academictype, open_qa == 1)
synthesis_application_subsets <- dplyr::filter(df_analysis_academictype, synthesis_application == 1)
personal_connection_subsets <- dplyr::filter(df_analysis_academictype, personal_connection == 1)
collaborative_interaction_subsets <- dplyr::filter(df_analysis_academictype, collaborative_interaction == 1)
# 5. Report the size of each subset
cat("✅ Academic-type subsets created successfully.\n")
## ✅ Academic-type subsets created successfully.
cat("knowledge_reproduction subset rows: ", nrow(knowledge_reproduction_subsets), "\n")
## knowledge_reproduction subset rows: 165
cat("open_qa_subset rows: ", nrow(open_qa_subsets), "\n")
## open_qa_subset rows: 176
cat("synthesis_application_subset rows:", nrow(synthesis_application_subsets), "\n")
## synthesis_application_subset rows: 3784
cat("personal_connection_subset rows: ", nrow(personal_connection_subsets), "\n")
## personal_connection_subset rows: 3427
cat("collaborative_interaction_subset rows:", nrow(collaborative_interaction_subsets), "\n")
## collaborative_interaction_subset rows: 3925
# Optional verification (disabled): tabulate each flag inside its own subset
# to confirm every retained row carries the required value of 1, e.g.
# table(collaborative_interaction_subsets$collaborative_interaction)
# Knowledge- and cognitive-level subsets, combined into one named list.
# Subsetting with `[` keeps the element names from the source lists, so this
# is equivalent to naming each element explicitly.
kc_subsets_list <- c(
  knowledge_subsets[c("Conceptual", "Factual", "Metacognitive", "Procedural")],
  cognitive_subsets[c("Remember", "Understand", "Analyze", "Apply", "Evaluate", "Create")]
)
# Purpose-type subsets, renamed with a "_purpose" suffix
purpose_subsets_list <- setNames(
  list(academic_subsets, social_subsets, logistics_subsets),
  c("academic_purpose", "social_purpose", "logistics_purpose")
)
# Academic-type subsets, keyed by their type names
academic_subsets_list <- setNames(
  list(
    knowledge_reproduction_subsets,
    open_qa_subsets,
    synthesis_application_subsets,
    personal_connection_subsets,
    collaborative_interaction_subsets
  ),
  c(
    "knowledge_reproduction",
    "open_qa",
    "synthesis_application",
    "personal_connection",
    "collaborative_interaction"
  )
)
# Combine all three lists into a single named list for easy iteration in RQ3
rq3_subsets_combined <- c(kc_subsets_list, purpose_subsets_list, academic_subsets_list)
cat("✅ Combined all purpose and academic subsets into 'rq3_subsets_combined'.\n")
## ✅ Combined all purpose and academic subsets into 'rq3_subsets_combined'.
cat("--- Structure of the Combined RQ3 Subsets ---\n")
## --- Structure of the Combined RQ3 Subsets ---
cat("Total number of subsets:", length(rq3_subsets_combined), "\n")
## Total number of subsets: 18
cat("Names of the subsets (for iteration):\n")
## Names of the subsets (for iteration):
print(names(rq3_subsets_combined))
## [1] "Conceptual" "Factual"
## [3] "Metacognitive" "Procedural"
## [5] "Remember" "Understand"
## [7] "Analyze" "Apply"
## [9] "Evaluate" "Create"
## [11] "academic_purpose" "social_purpose"
## [13] "logistics_purpose" "knowledge_reproduction"
## [15] "open_qa" "synthesis_application"
## [17] "personal_connection" "collaborative_interaction"
# ---- RQ1: overall effect of GPT_use on every discussion metric ----
# Map a (scalar) p-value to conventional significance stars.
significance_stars <- function(p) {
  if (p <= 0.001) "***" else if (p <= 0.01) "**" else if (p <= 0.05) "*" else ""
}
# Fit one fixed-effects regression per metric and keep the GPT_use row.
# Rows are collected in a list and bound once at the end: growing a data
# frame with add_row() inside a loop copies it on every iteration (O(n^2)).
results_df <- bind_rows(lapply(all_metrics, function(metric) {
  # Course-level controls + discussion-topic-content fixed effects
  fml <- as.formula(
    paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts + course_type | discussion_topic_content_clean")
  )
  model <- feols(fml, data = df_analysis_field)
  # Coefficient-table row for the GPT_use regressor
  gpt_row <- summary(model)$coeftable["GPT_use", ]
  # unname() keeps the coefficient-table names out of the data frame's
  # row names
  data.frame(
    metric = metric,
    coef_GPT = unname(gpt_row["Estimate"]),
    se_GPT = unname(gpt_row["Std. Error"]),
    t_value = unname(gpt_row["t value"]),
    p_value = unname(gpt_row["Pr(>|t|)"]),
    sigsig = significance_stars(gpt_row["Pr(>|t|)"]),
    stringsAsFactors = FALSE
  )
}))
# View summary table
results_df
## metric coef_GPT se_GPT t_value p_value
## 1 avg_similarity 0.0071345695 0.012862353 0.55468618 5.792464e-01
## 2 embedding_variance -0.0004597206 0.012500783 -0.03677535 9.706722e-01
## 3 semantic_spread -0.0485844277 0.023402998 -2.07599159 3.817657e-02
## 4 MRS_depth 0.0947692161 0.025907924 3.65792394 2.689503e-04
## 5 avg_word_count 0.0311344768 0.013929973 2.23507088 2.565600e-02
## 6 avg_analytic 0.1739144289 0.024718008 7.03594034 3.905403e-12
## 7 avg_informativeness 0.0054446573 0.027698550 0.19656832 8.442095e-01
## 8 percent_replied 0.0513754730 0.008865892 5.79473258 9.442316e-09
## 9 avg_depth 0.0043794416 0.003437259 1.27410858 2.029520e-01
## 10 max_depth 0.0266836813 0.016604079 1.60705576 1.083909e-01
## 11 avg_width 0.0032933809 0.003078169 1.06991554 2.849426e-01
## 12 max_width -0.0543230773 0.012907656 -4.20859357 2.825887e-05
## sigsig
## 1
## 2
## 3 *
## 4 ***
## 5 *
## 6 ***
## 7
## 8 ***
## 9
## 10
## 11
## 12 ***
# Fixed top-to-bottom display order for the Y-axis metrics
# (content-creation metrics first, then social-engagement metrics)
custom_metric_order <- c(
"avg_similarity",
"embedding_variance",
"semantic_spread",
"avg_word_count",
"MRS_depth",
"avg_analytic",
"avg_informativeness",
"percent_replied",
"avg_depth",
"max_depth",
"avg_width",
"max_width"
)
# Add a category variable for coloring the metrics
# (content_metrics and social_metrics are defined earlier in this script;
# a metric in neither vector would get category = NA)
results_df <- results_df %>%
mutate(
category = case_when(
metric %in% content_metrics ~ "Content Creation",
metric %in% social_metrics ~ "Social Engagement"),
# Approximate 95% CI bounds: estimate +/- 1.96 * SE (normal critical value)
lower_ci = coef_GPT - (1.96 * se_GPT),
upper_ci = coef_GPT + (1.96 * se_GPT),
# Pin the y-axis ordering to custom_metric_order; reversed so the first
# metric in that vector is drawn at the top of the plot
metric_ordered = factor(metric, levels = rev(custom_metric_order)))
# Forest plot of the RQ1 coefficients: one row per metric, colored by metric
# category, with ~95% CIs and significance stars.
rq1_forest <- ggplot(results_df, aes(x = coef_GPT, y = metric_ordered, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2) +
  # ~95% confidence intervals
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.25) +
  # Point estimate (coefficient)
  geom_point(size = 0.6) +
  # Significance stars, nudged away from the point in the direction of the
  # coefficient's sign
  geom_text(aes(label = sigsig, x = coef_GPT),
            hjust = ifelse(results_df$coef_GPT > 0, -0.5, 1.5),
            vjust = 0.5, size = 2) +
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ1: Overall Effect of GPT Use on Discussion Metrics",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_minimal(base_size = 9) +
  theme(
    legend.position = "bottom",
    plot.title.position = "plot"
  )
rq1_forest
# ---- RQ2: effect of GPT_use on every metric, within each field of study ----
# Map a (scalar) p-value to conventional significance stars.
significance_stars <- function(p) {
  if (p <= 0.001) "***" else if (p <= 0.01) "**" else if (p <= 0.05) "*" else ""
}
# Fit the fixed-effects model for one metric on one data subset and return
# the GPT_use row as a one-row data frame. Returns NULL when GPT_use does
# not appear in the coefficient table (e.g. dropped as constant/collinear
# within the subset).
extract_gpt_row <- function(df, metric) {
  fml <- as.formula(
    paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts | discussion_topic_content_clean")
  )
  ct <- summary(feols(fml, data = df))$coeftable
  if (!("GPT_use" %in% rownames(ct))) {
    return(NULL)
  }
  gpt_row <- ct["GPT_use", ]
  # unname() keeps the coefficient-table names out of the row names
  data.frame(
    metric = metric,
    coef_GPT = unname(gpt_row["Estimate"]),
    se_GPT = unname(gpt_row["Std. Error"]),
    t_value = unname(gpt_row["t value"]),
    p_value = unname(gpt_row["Pr(>|t|)"]),
    sigsig = significance_stars(gpt_row["Pr(>|t|)"]),
    stringsAsFactors = FALSE
  )
}
# One results table per field. bind_rows() silently drops NULL entries, and
# binding once per field avoids growing a data frame with add_row() inside
# the loop (O(n^2) copying).
results_list <- lapply(field_subsets, function(field_df) {
  bind_rows(lapply(all_metrics, function(metric) extract_gpt_row(field_df, metric)))
})
all_results <- bind_rows(results_list, .id = "field_name")
print(all_results)
## field_name metric coef_GPT se_GPT t_value
## 1 Applied STEM avg_similarity -0.0451781917 0.029415848 -1.535845280
## 2 Applied STEM embedding_variance 0.0432741052 0.029320755 1.475886454
## 3 Applied STEM semantic_spread -0.0202254652 0.045054367 -0.448912429
## 4 Applied STEM MRS_depth 0.1603079561 0.069937815 2.292149901
## 5 Applied STEM avg_word_count 0.1293759891 0.031304842 4.132778883
## 6 Applied STEM avg_analytic 0.3938340107 0.065073421 6.052148545
## 7 Applied STEM avg_informativeness 0.1062934315 0.049587284 2.143562299
## 8 Applied STEM percent_replied 0.0731801071 0.017189179 4.257335863
## 9 Applied STEM avg_depth 0.0074884721 0.010000979 0.748773921
## 10 Applied STEM max_depth 0.0540319267 0.028950341 1.866365801
## 11 Applied STEM avg_width 0.0004911200 0.006928092 0.070888210
## 12 Applied STEM max_width -0.0541088286 0.028071327 -1.927547946
## 13 Arts avg_similarity 0.0229993517 0.037901749 0.606815048
## 14 Arts embedding_variance -0.0248713903 0.037954599 -0.655293200
## 15 Arts semantic_spread -0.3084767990 0.183636064 -1.679826894
## 16 Arts MRS_depth 0.0626189614 0.041349575 1.514379822
## 17 Arts avg_word_count 0.0311770837 0.037722031 0.826495369
## 18 Arts avg_analytic -0.0200303371 0.054780096 -0.365649908
## 19 Arts avg_informativeness -0.1696486582 0.187796392 -0.903364842
## 20 Arts percent_replied -0.0040963626 0.039151548 -0.104628367
## 21 Arts avg_depth -0.0154325271 0.016288457 -0.947451743
## 22 Arts max_depth -0.0662494006 0.106966786 -0.619345528
## 23 Arts avg_width -0.0526399831 0.015203499 -3.462359668
## 24 Arts max_width -0.1387845732 0.069098458 -2.008504636
## 25 Humanities avg_similarity 0.0248728341 0.030130724 0.825497394
## 26 Humanities embedding_variance -0.0283353499 0.028301466 -1.001197262
## 27 Humanities semantic_spread -0.0132071243 0.059952489 -0.220293177
## 28 Humanities MRS_depth -0.0694039210 0.042453703 -1.634814314
## 29 Humanities avg_word_count -0.0467145194 0.031591304 -1.478714481
## 30 Humanities avg_analytic 0.1292423038 0.024245835 5.330495134
## 31 Humanities avg_informativeness 0.0189776147 0.059270979 0.320183927
## 32 Humanities percent_replied -0.0111851116 0.014109603 -0.792730427
## 33 Humanities avg_depth -0.0021413971 0.010487188 -0.204191739
## 34 Humanities max_depth -0.0018485530 0.039068429 -0.047315774
## 35 Humanities avg_width -0.0051153325 0.007998829 -0.639510201
## 36 Humanities max_width -0.0335449066 0.028234478 -1.188083119
## 37 Language avg_similarity -0.0134422239 0.032158866 -0.417994334
## 38 Language embedding_variance 0.0209997512 0.031171493 0.673684476
## 39 Language semantic_spread -0.1283748419 0.161132511 -0.796703542
## 40 Language MRS_depth 0.0028539730 0.005719933 0.498952133
## 41 Language avg_word_count -0.0207429452 0.017883345 -1.159902982
## 42 Language avg_analytic 0.0765590185 0.047931733 1.597251221
## 43 Language avg_informativeness 0.1329565333 0.103774961 1.281200508
## 44 Language percent_replied 0.1670353554 0.045623930 3.661134777
## 45 Language avg_depth 0.0625360938 0.027683343 2.258979086
## 46 Language max_depth -0.0092794308 0.109647396 -0.084629742
## 47 Language avg_width 0.1135685308 0.047990509 2.366478998
## 48 Language max_width -0.0605862151 0.092633361 -0.654043152
## 49 Social Sciences avg_similarity 0.0188464465 0.018132359 1.039381942
## 50 Social Sciences embedding_variance -0.0058153596 0.017557675 -0.331214666
## 51 Social Sciences semantic_spread -0.0587071667 0.036275358 -1.618375928
## 52 Social Sciences MRS_depth 0.1175321049 0.027159024 4.327552656
## 53 Social Sciences avg_word_count -0.0036722857 0.016101437 -0.228071923
## 54 Social Sciences avg_analytic 0.1005016303 0.018570501 5.411896504
## 55 Social Sciences avg_informativeness -0.0230039549 0.039638296 -0.580346714
## 56 Social Sciences percent_replied 0.0555110856 0.014446916 3.842417674
## 57 Social Sciences avg_depth 0.0001907704 0.003982519 0.047901950
## 58 Social Sciences max_depth 0.0319565218 0.026352112 1.212674043
## 59 Social Sciences avg_width 0.0011974120 0.003787414 0.316155513
## 60 Social Sciences max_width -0.0879287330 0.021658745 -4.059733580
## 61 Theoretical STEM avg_similarity -0.1437075925 0.058897458 -2.439962575
## 62 Theoretical STEM embedding_variance 0.1578668023 0.056647041 2.786849940
## 63 Theoretical STEM semantic_spread 0.0572408250 0.302052397 0.189506276
## 64 Theoretical STEM MRS_depth -0.0068639900 0.015804455 -0.434307281
## 65 Theoretical STEM avg_word_count -0.0191746204 0.020657047 -0.928236272
## 66 Theoretical STEM avg_analytic -0.0953836522 0.060928702 -1.565496214
## 67 Theoretical STEM avg_informativeness -0.2056456370 0.152018920 -1.352763443
## 68 Theoretical STEM percent_replied 0.1489499932 0.038597390 3.859069052
## 69 Theoretical STEM avg_depth 0.0072973534 0.022877205 0.318979232
## 70 Theoretical STEM max_depth 0.3319884273 0.260907112 1.272439164
## 71 Theoretical STEM avg_width -0.0183388395 0.017957736 -1.021222259
## 72 Theoretical STEM max_width -0.0008636750 0.094105723 -0.009177709
## p_value sigsig
## 1 1.280084e-01
## 2 1.433889e-01
## 3 6.545505e-01
## 4 2.417766e-02 *
## 5 7.895583e-05 ***
## 6 3.057902e-08 ***
## 7 3.470570e-02 *
## 8 4.979934e-05 ***
## 9 4.559036e-01
## 10 6.517573e-02
## 11 9.436406e-01
## 12 5.699626e-02
## 13 5.473195e-01
## 14 5.159391e-01
## 15 1.005965e-01
## 16 1.376013e-01
## 17 4.133072e-01
## 18 7.165061e-01
## 19 3.716095e-01
## 20 9.171809e-01
## 21 3.489597e-01
## 22 5.391154e-01
## 23 1.265993e-03 **
## 24 5.120704e-02
## 25 4.099715e-01
## 26 3.178166e-01
## 27 8.258444e-01
## 28 1.034988e-01
## 29 1.406278e-01
## 30 2.394642e-07 ***
## 31 7.491287e-01
## 32 4.287775e-01
## 33 8.383899e-01
## 34 9.623039e-01
## 35 5.231475e-01
## 36 2.360645e-01
## 37 6.771299e-01
## 38 5.025557e-01
## 39 4.281051e-01
## 40 6.192543e-01
## 41 2.497193e-01
## 42 1.143596e-01
## 43 2.040188e-01
## 44 4.615549e-04 ***
## 45 2.675177e-02 *
## 46 9.327783e-01
## 47 2.050623e-02 *
## 48 5.150583e-01
## 49 2.992209e-01
## 50 7.406464e-01
## 51 1.063270e-01
## 52 1.883692e-05 ***
## 53 8.197005e-01
## 54 1.046565e-07 ***
## 55 5.619899e-01
## 56 1.405126e-04 ***
## 57 9.618170e-01
## 58 2.259313e-01
## 59 7.520404e-01
## 60 5.853660e-05 ***
## 61 2.022697e-02 *
## 62 8.757261e-03 **
## 63 8.508569e-01
## 64 6.668919e-01
## 65 3.600251e-01
## 66 1.270069e-01
## 67 1.853278e-01
## 68 5.010349e-04 ***
## 69 7.517526e-01
## 70 2.121100e-01
## 71 3.145778e-01
## 72 9.927326e-01
# Prepare the RQ2 results for plotting: add the metric category, ~95% CI
# bounds, and fixed factor orderings for the y-axis and the facet panels.
all_results_plot <- all_results %>%
  mutate(
    category = case_when(
      metric %in% content_metrics ~ "Content Creation",
      metric %in% social_metrics ~ "Social Engagement")
  ) %>%
  # Approximate 95% CI: estimate +/- 1.96 * SE
  mutate(
    lower_ci = coef_GPT - (1.96 * se_GPT),
    upper_ci = coef_GPT + (1.96 * se_GPT)
  ) %>%
  mutate(
    # Reversed so the first metric in custom_metric_order sits at the top
    metric = factor(metric, levels = rev(custom_metric_order)),
    # Facet panels appear in this fixed field order
    field_name = factor(field_name, levels = c("Applied STEM", "Theoretical STEM", "Social Sciences", "Humanities", "Language", "Arts"))
  )
# Grouped forest plot: one facet per field of study, metrics on the y-axis,
# colored by metric category, with ~95% CIs and significance stars.
rq2_forest <- ggplot(all_results_plot, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2) +
  # ~95% confidence intervals and point estimates
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.15) +
  geom_point(size = 0.12) +
  # Significance stars, nudged away from the point in the direction of the
  # coefficient's sign
  geom_text(aes(label = sigsig, x = coef_GPT),
            hjust = ifelse(all_results_plot$coef_GPT > 0, -0.2, 1.2),
            vjust = 0.5, size = 2) +
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  # One panel per field of study, shared axes across panels
  facet_wrap(~ field_name, scales = "fixed", ncol = 3) +
  labs(
    title = "RQ2: Effect of GPT Use on Discussion Metrics, by Field of Study",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 9) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray90"),
    plot.title.position = "plot"
  )
rq2_forest
# ---- RQ3: effect of GPT_use on every metric, within each question-type subset ----
# Map a (scalar) p-value to conventional significance stars.
significance_stars <- function(p) {
  if (p <= 0.001) "***" else if (p <= 0.01) "**" else if (p <= 0.05) "*" else ""
}
# Fit the fixed-effects model for one metric on one data subset and return
# the GPT_use row as a one-row data frame. Returns NULL when GPT_use does
# not appear in the coefficient table (e.g. dropped as constant/collinear
# within the subset).
extract_gpt_row <- function(df, metric) {
  fml <- as.formula(
    paste0(metric, " ~ GPT_use + discussion_size + pct_international + pct_posts | discussion_topic_content_clean")
  )
  ct <- summary(feols(fml, data = df))$coeftable
  if (!("GPT_use" %in% rownames(ct))) {
    return(NULL)
  }
  gpt_row <- ct["GPT_use", ]
  # unname() keeps the coefficient-table names out of the row names
  data.frame(
    metric = metric,
    coef_GPT = unname(gpt_row["Estimate"]),
    se_GPT = unname(gpt_row["Std. Error"]),
    t_value = unname(gpt_row["t value"]),
    p_value = unname(gpt_row["Pr(>|t|)"]),
    sigsig = significance_stars(gpt_row["Pr(>|t|)"]),
    stringsAsFactors = FALSE
  )
}
# One results table per RQ3 subset (knowledge/cognitive levels, purpose
# types, academic types). bind_rows() silently drops NULL entries, and
# binding once per subset avoids growing a data frame with add_row() inside
# the loop (O(n^2) copying).
results_list_rq3 <- lapply(rq3_subsets_combined, function(subset_df) {
  bind_rows(lapply(all_metrics, function(metric) extract_gpt_row(subset_df, metric)))
})
# Combine all RQ3 subset results into one dataframe
all_results_rq3 <- bind_rows(results_list_rq3, .id = "subset_name")
print(all_results_rq3)
## subset_name metric coef_GPT se_GPT
## 1 Conceptual avg_similarity 2.531389e-03 0.015382959
## 2 Conceptual embedding_variance 3.130661e-03 0.015020269
## 3 Conceptual semantic_spread -6.285074e-02 0.031736046
## 4 Conceptual MRS_depth 1.269359e-01 0.038307578
## 5 Conceptual avg_word_count 4.747419e-02 0.021037129
## 6 Conceptual avg_analytic 1.895984e-01 0.033216501
## 7 Conceptual avg_informativeness 4.217785e-02 0.036645558
## 8 Conceptual percent_replied 5.172237e-02 0.011068057
## 9 Conceptual avg_depth 4.902173e-03 0.003025158
## 10 Conceptual max_depth 3.996510e-02 0.020532454
## 11 Conceptual avg_width 5.518516e-03 0.003336772
## 12 Conceptual max_width -6.382930e-02 0.016198296
## 13 Factual avg_similarity -6.938187e-03 0.045745229
## 14 Factual embedding_variance 8.642255e-03 0.042389004
## 15 Factual semantic_spread 4.820670e-02 0.102551413
## 16 Factual MRS_depth 8.613037e-02 0.045048271
## 17 Factual avg_word_count -1.249292e-02 0.028443272
## 18 Factual avg_analytic 6.725825e-02 0.036682178
## 19 Factual avg_informativeness 6.900777e-02 0.056104804
## 20 Factual percent_replied 5.193187e-02 0.035104601
## 21 Factual avg_depth 9.729454e-03 0.012541486
## 22 Factual max_depth 1.211677e-01 0.075169000
## 23 Factual avg_width 1.818188e-02 0.021061312
## 24 Factual max_width -8.816521e-02 0.051243285
## 25 Metacognitive avg_similarity 1.104270e-02 0.024122641
## 26 Metacognitive embedding_variance -1.660771e-03 0.023352717
## 27 Metacognitive semantic_spread -1.318054e-02 0.038427590
## 28 Metacognitive MRS_depth 1.981767e-02 0.038634441
## 29 Metacognitive avg_word_count 5.715306e-03 0.016572230
## 30 Metacognitive avg_analytic 1.676989e-01 0.039124680
## 31 Metacognitive avg_informativeness -5.237807e-02 0.042983204
## 32 Metacognitive percent_replied 4.173795e-02 0.015817950
## 33 Metacognitive avg_depth 3.230395e-03 0.006157868
## 34 Metacognitive max_depth -7.212578e-04 0.028966813
## 35 Metacognitive avg_width -5.538945e-04 0.005042570
## 36 Metacognitive max_width -1.103794e-02 0.022240858
## 37 Procedural avg_similarity 6.233362e-02 0.043637672
## 38 Procedural embedding_variance -5.849178e-02 0.042018808
## 39 Procedural semantic_spread -1.888966e-01 0.119989595
## 40 Procedural MRS_depth -1.346716e-01 0.099036356
## 41 Procedural avg_word_count 1.147371e-01 0.064970625
## 42 Procedural avg_analytic 3.025601e-02 0.047214510
## 43 Procedural avg_informativeness -3.458931e-02 0.187301880
## 44 Procedural percent_replied -2.390325e-02 0.044983010
## 45 Procedural avg_depth -2.158465e-02 0.062302428
## 46 Procedural max_depth 9.047750e-02 0.090284129
## 47 Procedural avg_width -2.547193e-02 0.042948652
## 48 Procedural max_width 6.330911e-03 0.048332048
## 49 Remember avg_similarity 1.825470e-02 0.173180037
## 50 Remember embedding_variance 1.508534e-02 0.146532494
## 51 Remember semantic_spread 1.998034e-01 0.145979324
## 52 Remember MRS_depth 5.588751e-02 0.036194975
## 53 Remember avg_word_count -3.924629e-02 0.030897802
## 54 Remember avg_analytic -2.372769e-02 0.068086366
## 55 Remember avg_informativeness 8.144050e-02 0.122521632
## 56 Remember percent_replied 9.129731e-03 0.034034374
## 57 Remember avg_depth 1.135166e-02 0.034208173
## 58 Remember max_depth 9.809983e-02 0.187675221
## 59 Remember avg_width 5.939667e-03 0.043004297
## 60 Remember max_width -2.493942e-02 0.053767846
## 61 Understand avg_similarity 2.874144e-02 0.021668629
## 62 Understand embedding_variance -3.304744e-02 0.021211718
## 63 Understand semantic_spread -4.185762e-02 0.081477092
## 64 Understand MRS_depth 3.878261e-02 0.039963072
## 65 Understand avg_word_count -3.394024e-02 0.018239536
## 66 Understand avg_analytic 8.865502e-02 0.022563489
## 67 Understand avg_informativeness 2.184231e-02 0.053608783
## 68 Understand percent_replied 4.901041e-02 0.021980696
## 69 Understand avg_depth 1.306712e-02 0.006120530
## 70 Understand max_depth 2.244351e-02 0.054651812
## 71 Understand avg_width 1.758338e-02 0.010435395
## 72 Understand max_width -7.702887e-02 0.035915283
## 73 Analyze avg_similarity 1.328999e-02 0.023143457
## 74 Analyze embedding_variance -1.583466e-02 0.022549331
## 75 Analyze semantic_spread -5.940332e-02 0.058474535
## 76 Analyze MRS_depth -8.601259e-02 0.055320202
## 77 Analyze avg_word_count -4.248803e-02 0.030733076
## 78 Analyze avg_analytic 6.878766e-02 0.020434840
## 79 Analyze avg_informativeness -4.731850e-02 0.064771729
## 80 Analyze percent_replied 7.260251e-03 0.015156126
## 81 Analyze avg_depth -4.498438e-04 0.004402931
## 82 Analyze max_depth -6.279540e-02 0.033992193
## 83 Analyze avg_width 1.572861e-03 0.005011922
## 84 Analyze max_width -3.261156e-02 0.023197842
## 85 Apply avg_similarity -2.390456e-03 0.043388879
## 86 Apply embedding_variance -9.062013e-03 0.040494490
## 87 Apply semantic_spread 1.376479e-02 0.119208862
## 88 Apply MRS_depth -1.387134e-01 0.117551626
## 89 Apply avg_word_count 5.316527e-02 0.040523055
## 90 Apply avg_analytic 5.361716e-02 0.050822345
## 91 Apply avg_informativeness -2.165877e-02 0.165394484
## 92 Apply percent_replied -3.102590e-04 0.046450078
## 93 Apply avg_depth 2.746244e-03 0.018923065
## 94 Apply max_depth 6.879845e-02 0.097680558
## 95 Apply avg_width 8.491647e-03 0.024561511
## 96 Apply max_width 4.918955e-02 0.071408044
## 97 Evaluate avg_similarity -1.417655e-02 0.020318564
## 98 Evaluate embedding_variance 2.553848e-02 0.019621020
## 99 Evaluate semantic_spread -4.987853e-02 0.033863011
## 100 Evaluate MRS_depth 1.651973e-01 0.042645178
## 101 Evaluate avg_word_count 6.453134e-02 0.020734194
## 102 Evaluate avg_analytic 2.399010e-01 0.042032079
## 103 Evaluate avg_informativeness 6.810347e-02 0.040811952
## 104 Evaluate percent_replied 8.190668e-02 0.013409916
## 105 Evaluate avg_depth 4.924346e-03 0.004319835
## 106 Evaluate max_depth 6.613770e-02 0.023214619
## 107 Evaluate avg_width 5.000044e-03 0.003423585
## 108 Evaluate max_width -5.810913e-02 0.017886575
## 109 Create avg_similarity 4.223489e-02 0.022668943
## 110 Create embedding_variance -3.285425e-02 0.021922399
## 111 Create semantic_spread -3.461674e-02 0.051165309
## 112 Create MRS_depth 3.933948e-02 0.032430891
## 113 Create avg_word_count 3.335922e-02 0.026573544
## 114 Create avg_analytic 1.485961e-01 0.042186255
## 115 Create avg_informativeness -1.024676e-01 0.056970657
## 116 Create percent_replied 1.232573e-02 0.020975884
## 117 Create avg_depth 5.412446e-04 0.010448701
## 118 Create max_depth 1.382629e-02 0.030639811
## 119 Create avg_width -6.338086e-03 0.009048055
## 120 Create max_width -1.404213e-02 0.028787478
## 121 academic_purpose avg_similarity 9.074311e-05 0.014619973
## 122 academic_purpose embedding_variance 5.938514e-03 0.014252079
## 123 academic_purpose semantic_spread -5.062477e-02 0.025512163
## 124 academic_purpose MRS_depth 9.412378e-02 0.030718810
## 125 academic_purpose avg_word_count 2.780500e-02 0.015501997
## 126 academic_purpose avg_analytic 1.823023e-01 0.026080435
## 127 academic_purpose avg_informativeness 1.695409e-03 0.029892689
## 128 academic_purpose percent_replied 4.987550e-02 0.010125311
## 129 academic_purpose avg_depth 1.718419e-03 0.003618323
## 130 academic_purpose max_depth 2.426550e-02 0.018735667
## 131 academic_purpose avg_width 5.055129e-04 0.003076711
## 132 academic_purpose max_width -4.246096e-02 0.013740186
## 133 social_purpose avg_similarity 7.081662e-02 0.021936975
## 134 social_purpose embedding_variance -5.938132e-02 0.020835991
## 135 social_purpose semantic_spread 1.576654e-01 0.069927892
## 136 social_purpose MRS_depth -1.055531e-01 0.056705482
## 137 social_purpose avg_word_count -3.903140e-02 0.021389250
## 138 social_purpose avg_analytic 8.175870e-02 0.025597045
## 139 social_purpose avg_informativeness 4.756487e-04 0.084832015
## 140 social_purpose percent_replied 4.957204e-02 0.019191794
## 141 social_purpose avg_depth 3.781138e-03 0.009673251
## 142 social_purpose max_depth -2.140583e-02 0.038214563
## 143 social_purpose avg_width 1.099134e-02 0.012969658
## 144 social_purpose max_width -3.183582e-02 0.035164618
## 145 logistics_purpose avg_similarity -1.353468e-02 0.020877881
## 146 logistics_purpose embedding_variance 1.613385e-02 0.020464455
## 147 logistics_purpose semantic_spread -5.500579e-02 0.066524659
## 148 logistics_purpose MRS_depth 3.334903e-02 0.035404659
## 149 logistics_purpose avg_word_count 6.442925e-02 0.034422915
## 150 logistics_purpose avg_analytic 2.539181e-01 0.069746355
## 151 logistics_purpose avg_informativeness 5.471802e-02 0.056031629
## 152 logistics_purpose percent_replied 7.199088e-03 0.017824708
## 153 logistics_purpose avg_depth -1.691704e-02 0.011498671
## 154 logistics_purpose max_depth 3.903285e-02 0.040202688
## 155 logistics_purpose avg_width -1.485298e-02 0.010013254
## 156 logistics_purpose max_width -4.192351e-02 0.045706259
## 157 knowledge_reproduction avg_similarity 3.057160e-02 0.044581027
## 158 knowledge_reproduction embedding_variance -3.667423e-02 0.042227807
## 159 knowledge_reproduction semantic_spread 2.235642e-01 0.187078852
## 160 knowledge_reproduction MRS_depth 4.840812e-02 0.045857348
## 161 knowledge_reproduction avg_word_count -1.005250e-01 0.082455522
## 162 knowledge_reproduction avg_analytic 9.226357e-02 0.044972933
## 163 knowledge_reproduction avg_informativeness -2.588848e-01 0.213085182
## 164 knowledge_reproduction percent_replied -3.931596e-02 0.036276489
## 165 knowledge_reproduction avg_depth -1.137908e-02 0.009675375
## 166 knowledge_reproduction max_depth -2.161743e-01 0.136963724
## 167 knowledge_reproduction avg_width -1.048937e-03 0.008248571
## 168 knowledge_reproduction max_width -2.002576e-02 0.066847269
## 169 open_qa avg_similarity -1.166194e-02 0.060452359
## 170 open_qa embedding_variance 1.983962e-02 0.058136578
## 171 open_qa semantic_spread -1.184601e-01 0.200970518
## 172 open_qa MRS_depth -2.404695e-02 0.030438963
## 173 open_qa avg_word_count -1.066087e-02 0.030154185
## 174 open_qa avg_analytic -6.591060e-02 0.069458366
## 175 open_qa avg_informativeness -9.383234e-02 0.081406388
## 176 open_qa percent_replied 1.144998e-01 0.035489111
## 177 open_qa avg_depth 8.675154e-03 0.026301278
## 178 open_qa max_depth 2.114916e-01 0.152351974
## 179 open_qa avg_width -2.766310e-02 0.026956595
## 180 open_qa max_width -2.848067e-01 0.085964500
## 181 synthesis_application avg_similarity -2.874174e-02 0.020526602
## 182 synthesis_application embedding_variance 3.139672e-02 0.020260867
## 183 synthesis_application semantic_spread -6.672425e-02 0.029475262
## 184 synthesis_application MRS_depth 8.448195e-02 0.043286847
## 185 synthesis_application avg_word_count 4.843257e-02 0.020430706
## 186 synthesis_application avg_analytic 2.163614e-01 0.036051812
## 187 synthesis_application avg_informativeness 2.937099e-02 0.036765557
## 188 synthesis_application percent_replied 4.269254e-02 0.012530738
## 189 synthesis_application avg_depth 4.582789e-03 0.004955156
## 190 synthesis_application max_depth 5.433943e-02 0.020288780
## 191 synthesis_application avg_width 2.089875e-03 0.003603361
## 192 synthesis_application max_width -1.587836e-02 0.016297927
## 193 personal_connection avg_similarity 1.962604e-03 0.020639482
## 194 personal_connection embedding_variance 6.929977e-03 0.019962218
## 195 personal_connection semantic_spread -3.473864e-03 0.036084449
## 196 personal_connection MRS_depth 9.082662e-02 0.039318138
## 197 personal_connection avg_word_count 1.634237e-02 0.020161512
## 198 personal_connection avg_analytic 2.004856e-01 0.039609916
## 199 personal_connection avg_informativeness 4.501023e-03 0.038738282
## 200 personal_connection percent_replied 5.922759e-02 0.013104658
## 201 personal_connection avg_depth 5.999611e-03 0.004700297
## 202 personal_connection max_depth 1.129853e-02 0.025887450
## 203 personal_connection avg_width 3.658704e-03 0.003592509
## 204 personal_connection max_width -1.890573e-02 0.018703903
## 205 collaborative_interaction avg_similarity -1.538006e-02 0.020214672
## 206 collaborative_interaction embedding_variance 2.005004e-02 0.019890314
## 207 collaborative_interaction semantic_spread -6.436680e-02 0.030650072
## 208 collaborative_interaction MRS_depth 1.438568e-01 0.039381455
## 209 collaborative_interaction avg_word_count 3.734275e-02 0.020682689
## 210 collaborative_interaction avg_analytic 2.443956e-01 0.034086195
## 211 collaborative_interaction avg_informativeness 4.247767e-02 0.034348494
## 212 collaborative_interaction percent_replied 6.788847e-02 0.012012129
## 213 collaborative_interaction avg_depth -1.156034e-03 0.004971934
## 214 collaborative_interaction max_depth 1.544803e-02 0.024319386
## 215 collaborative_interaction avg_width -2.433068e-03 0.004182134
## 216 collaborative_interaction max_width -5.315477e-02 0.019390284
## t_value p_value sigsig
## 1 0.164557983 8.693776e-01
## 2 0.208429067 8.350033e-01
## 3 -1.980421271 4.836214e-02 *
## 4 3.313597315 1.007612e-03 **
## 5 2.256685985 2.458346e-02 *
## 6 5.707958935 2.274429e-08 ***
## 7 1.150967699 2.504543e-01
## 8 4.673120556 4.098144e-06 ***
## 9 1.620468721 1.059440e-01
## 10 1.946435458 5.232390e-02
## 11 1.653848687 9.896702e-02
## 12 -3.940494755 9.643217e-05 ***
## 13 -0.151670173 8.799734e-01
## 14 0.203879640 8.391612e-01
## 15 0.470073437 6.400653e-01
## 16 1.911957335 6.082553e-02
## 17 -0.439222357 6.621325e-01
## 18 1.833540170 7.185504e-02
## 19 1.229979688 2.236705e-01
## 20 1.479346400 1.444590e-01
## 21 0.775781591 4.410299e-01
## 22 1.611936872 1.124046e-01
## 23 0.863283235 3.915376e-01
## 24 -1.720522225 9.066909e-02
## 25 0.457773239 6.473561e-01
## 26 -0.071116804 9.433392e-01
## 27 -0.342996683 7.317755e-01
## 28 0.512953476 6.082581e-01
## 29 0.344872484 7.303658e-01
## 30 4.286269528 2.263645e-05 ***
## 31 -1.218570697 2.237028e-01
## 32 2.638644634 8.638556e-03 **
## 33 0.524596347 6.001453e-01
## 34 -0.024899453 9.801472e-01
## 35 -0.109843699 9.125867e-01
## 36 -0.496291146 6.199530e-01
## 37 1.428435985 1.600709e-01
## 38 -1.392038179 1.707529e-01
## 39 -1.574275119 1.224291e-01
## 40 -1.359819624 1.806615e-01
## 41 1.765983693 8.418234e-02
## 42 0.640820107 5.248897e-01
## 43 -0.184671441 8.543165e-01
## 44 -0.531383954 5.977661e-01
## 45 -0.346449576 7.306197e-01
## 46 1.002141835 3.216336e-01
## 47 -0.593078773 5.560976e-01
## 48 0.130987851 8.963688e-01
## 49 0.105408816 9.172172e-01
## 50 0.102948765 9.191419e-01
## 51 1.368709935 1.879306e-01
## 52 1.544068070 1.399708e-01
## 53 -1.270196914 2.201888e-01
## 54 -0.348494061 7.315120e-01
## 55 0.664703036 5.146669e-01
## 56 0.268250309 7.915567e-01
## 57 0.331840482 7.438424e-01
## 58 0.522710614 6.075502e-01
## 59 0.138117996 8.916805e-01
## 60 -0.463835145 6.483259e-01
## 61 1.326407697 1.869950e-01
## 62 -1.557980179 1.216323e-01
## 63 -0.513734808 6.082963e-01
## 64 0.970461117 3.335916e-01
## 65 -1.860806277 6.499638e-02
## 66 3.929136188 1.368237e-04 ***
## 67 0.407438971 6.843456e-01
## 68 2.229702327 2.745701e-02 *
## 69 2.134965032 3.461127e-02 *
## 70 0.410663624 6.819851e-01
## 71 1.684974739 9.435628e-02
## 72 -2.144737828 3.380536e-02 *
## 73 0.574244030 5.664679e-01
## 74 -0.702222843 4.833819e-01
## 75 -1.015883539 3.109501e-01
## 76 -1.554813363 1.216204e-01
## 77 -1.382485496 1.684121e-01
## 78 3.366195173 9.187795e-04 ***
## 79 -0.730542472 4.659400e-01
## 80 0.479030778 6.324562e-01
## 81 -0.102169165 9.187279e-01
## 82 -1.847347822 6.621952e-02
## 83 0.313823962 7.539919e-01
## 84 -1.405801270 1.613827e-01
## 85 -0.055093740 9.563250e-01
## 86 -0.223783856 8.240105e-01
## 87 0.115467855 9.086246e-01
## 88 -1.180021511 2.446322e-01
## 89 1.311975887 1.966542e-01
## 90 1.054991858 2.974603e-01
## 91 -0.130952202 8.964384e-01
## 92 -0.006679408 9.947023e-01
## 93 0.145126785 8.853050e-01
## 94 0.704320861 4.851202e-01
## 95 0.345729821 7.312716e-01
## 96 0.688851719 4.947045e-01
## 97 -0.697714396 4.859607e-01
## 98 1.301587582 1.941751e-01
## 99 -1.472950300 1.419379e-01
## 100 3.873761827 1.347699e-04 ***
## 101 3.112314870 2.057101e-03 **
## 102 5.707568709 3.032197e-08 ***
## 103 1.668713942 9.634186e-02
## 104 6.107919246 3.535603e-09 ***
## 105 1.139938646 2.553296e-01
## 106 2.848967671 4.726386e-03 **
## 107 1.460470051 1.453322e-01
## 108 -3.248756673 1.306968e-03 **
## 109 1.863117064 6.363754e-02
## 110 -1.498661181 1.352437e-01
## 111 -0.676566686 4.993166e-01
## 112 1.213025112 2.262837e-01
## 113 1.255354526 2.105412e-01
## 114 3.522381270 5.096699e-04 ***
## 115 -1.798603353 7.330724e-02
## 116 0.587614466 5.573301e-01
## 117 0.051800176 9.587300e-01
## 118 0.451252370 6.522052e-01
## 119 -0.700491559 4.842824e-01
## 120 -0.487785953 6.261359e-01
## 121 0.006206790 9.950495e-01
## 122 0.416677055 6.770426e-01
## 123 -1.984338631 4.760873e-02 *
## 124 3.064043758 2.267633e-03 **
## 125 1.793640050 7.330277e-02
## 126 6.990001657 6.428035e-12 ***
## 127 0.056716522 9.547872e-01
## 128 4.925824528 1.049502e-06 ***
## 129 0.474921372 6.349914e-01
## 130 1.295150066 1.956961e-01
## 131 0.164303040 8.695401e-01
## 132 -3.090275726 2.079072e-03 **
## 133 3.228185263 1.561512e-03 **
## 134 -2.849939887 5.053881e-03 **
## 135 2.254685036 2.575135e-02 *
## 136 -1.861427583 6.484250e-02
## 137 -1.824813936 7.022367e-02
## 138 3.194067910 1.743551e-03 **
## 139 0.005606948 9.955345e-01
## 140 2.582980848 1.085137e-02 *
## 141 0.390885968 6.964932e-01
## 142 -0.560148600 5.762996e-01
## 143 0.847465389 3.982247e-01
## 144 -0.905336659 3.668881e-01
## 145 -0.648278437 5.178302e-01
## 146 0.788384075 4.317591e-01
## 147 -0.826848096 4.096818e-01
## 148 0.941939014 3.477905e-01
## 149 1.871696380 6.326306e-02
## 150 3.640592414 3.777047e-04 ***
## 151 0.976555884 3.304155e-01
## 152 0.403882545 6.868941e-01
## 153 -1.471217287 1.433993e-01
## 154 0.970901455 3.332141e-01
## 155 -1.483331827 1.401566e-01
## 156 -0.917237877 3.605411e-01
## 157 0.685753530 4.963865e-01
## 158 -0.868485351 3.897371e-01
## 159 1.195026710 2.383369e-01
## 160 1.055623978 2.967747e-01
## 161 -1.219142599 2.291422e-01
## 162 2.051535512 4.606198e-02 *
## 163 -1.214935506 2.307272e-01
## 164 -1.083786116 2.842325e-01
## 165 -1.176086440 2.457448e-01
## 166 -1.578332418 1.214939e-01
## 167 -0.127165904 8.993756e-01
## 168 -0.299574771 7.658809e-01
## 169 -0.192911325 8.476799e-01
## 170 0.341258900 7.341009e-01
## 171 -0.589440228 5.577791e-01
## 172 -0.790005507 4.326358e-01
## 173 -0.353545428 7.249188e-01
## 174 -0.948922351 3.464667e-01
## 175 -1.152640978 2.536294e-01
## 176 3.226336378 2.032209e-03 **
## 177 0.329837748 7.426714e-01
## 178 1.388177717 1.702159e-01
## 179 -1.026209054 3.089126e-01
## 180 -3.313073348 1.567320e-03 **
## 181 -1.400218884 1.622510e-01
## 182 1.549623808 1.220513e-01
## 183 -2.263737366 2.414473e-02 *
## 184 1.951677022 5.169940e-02
## 185 2.370577460 1.825085e-02 *
## 186 6.001401164 4.510430e-09 ***
## 187 0.798872386 4.248557e-01
## 188 3.407025184 7.258191e-04 ***
## 189 0.924852555 3.556204e-01
## 190 2.678299775 7.715862e-03 **
## 191 0.579979317 5.622669e-01
## 192 -0.974256123 3.305395e-01
## 193 0.095089786 9.242921e-01
## 194 0.347154664 7.286614e-01
## 195 -0.096270385 9.233550e-01
## 196 2.310043739 2.140444e-02 *
## 197 0.810572674 4.181029e-01
## 198 5.061499242 6.409637e-07 ***
## 199 0.116190556 9.075610e-01
## 200 4.519583007 8.215935e-06 ***
## 201 1.276432144 2.025585e-01
## 202 0.436448339 6.627518e-01
## 203 1.018425829 3.091038e-01
## 204 -1.010790442 3.127402e-01
## 205 -0.760836383 4.472342e-01
## 206 1.008030482 3.140915e-01
## 207 -2.100053927 3.639342e-02 *
## 208 3.652907238 2.962844e-04 ***
## 209 1.805507476 7.179929e-02
## 210 7.169929080 4.040396e-12 ***
## 211 1.236667533 2.169865e-01
## 212 5.651660108 3.155234e-08 ***
## 213 -0.232512004 8.162675e-01
## 214 0.635214777 5.256768e-01
## 215 -0.581776797 5.610675e-01
## 216 -2.741309377 6.413278e-03 **
library(forcats)
# Prepare the RQ3 subset-regression results for plotting: tag each prompt
# subset with its top-level framework, classify each metric for coloring,
# and attach 95% Wald confidence bounds around the GPT_use coefficient.
rq3_plot_data <- all_results_rq3 %>%
  mutate(
    # Top-level grouping variable used for faceting / filtering below
    framework = case_when(
      subset_name %in% c("Factual", "Conceptual", "Procedural",
                         "Metacognitive") ~ "Knowledge Type",
      subset_name %in% c("Remember", "Understand", "Analyze", "Evaluate",
                         "Apply", "Create") ~ "Cognitive Process",
      subset_name %in% c("academic_purpose", "social_purpose",
                         "logistics_purpose") ~ "Purpose Type",
      TRUE ~ "Academic Type" # catches the remaining 5 academic types
    )
  ) %>%
  mutate(
    # Metric category drives the color scale in every RQ3 plot
    # (%in% never returns NA, so a two-branch if_else is safe here)
    category = if_else(metric %in% content_metrics,
                       "Content Creation", "Social Engagement"),
    # 95% confidence interval: coefficient +/- 1.96 standard errors
    lower_ci = coef_GPT - 1.96 * se_GPT,
    upper_ci = coef_GPT + 1.96 * se_GPT
  )
# Filter data for Knowledge Types
knowledge_data <- rq3_plot_data %>%
  filter(framework == "Knowledge Type") %>%
  # Reorder subset_name for presentation within the facet; reverse the
  # metric order so the first metric in custom_metric_order sits at the top
  mutate(
    subset_name_ordered = fct_relevel(subset_name, "Factual", "Conceptual",
                                      "Procedural", "Metacognitive"),
    metric = factor(metric, levels = rev(custom_metric_order))
  )
ggplot(knowledge_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # FIX: compute hjust inside aes() so each significance star aligns with its
  # own point. Passing knowledge_data$coef_GPT as a layer parameter breaks
  # under facet_wrap(): the full-length vector is matched against each
  # panel's row subset and silently recycled, misplacing the labels.
  geom_text(aes(label = sigsig, x = coef_GPT,
                hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # Facet by the 4 knowledge types
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Knowledge Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )
# Filter data for Cognitive Process Types
cognitive_data <- rq3_plot_data %>%
  filter(framework == "Cognitive Process") %>%
  # Reorder subset_name to follow Bloom's Taxonomy; reverse the metric
  # order so the first metric in custom_metric_order sits at the top
  mutate(
    subset_name_ordered = fct_relevel(subset_name, "Remember", "Understand",
                                      "Apply", "Analyze", "Evaluate", "Create"),
    metric = factor(metric, levels = rev(custom_metric_order))
  )
ggplot(cognitive_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # FIX: compute hjust inside aes() so each significance star aligns with its
  # own point. Passing cognitive_data$coef_GPT as a layer parameter breaks
  # under facet_wrap(): the full-length vector is matched against each
  # panel's row subset and silently recycled, misplacing the labels.
  geom_text(aes(label = sigsig, x = coef_GPT,
                hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 3) + # Facet by the 6 cognitive types
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Cognitive Process Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )
# Filter data for Purpose Types
purpose_data <- rq3_plot_data %>%
  filter(framework == "Purpose Type") %>%
  # Reorder subset_name for presentation; reverse the metric order so the
  # first metric in custom_metric_order sits at the top of the y-axis
  mutate(
    subset_name_ordered = fct_relevel(subset_name, "academic_purpose",
                                      "social_purpose", "logistics_purpose"),
    metric = factor(metric, levels = rev(custom_metric_order))
  )
ggplot(purpose_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # FIX: compute hjust inside aes() so each significance star aligns with its
  # own point. Passing purpose_data$coef_GPT as a layer parameter breaks
  # under facet_wrap(): the full-length vector is matched against each
  # panel's row subset and silently recycled, misplacing the labels.
  geom_text(aes(label = sigsig, x = coef_GPT,
                hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # Facet by the 3 purpose types
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Purpose Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )
# Filter data for Academic Types
academic_data <- rq3_plot_data %>%
  filter(framework == "Academic Type") %>%
  # Reorder subset_name for presentation; reverse the metric order so the
  # first metric in custom_metric_order sits at the top of the y-axis
  mutate(
    subset_name_ordered = fct_relevel(subset_name, "knowledge_reproduction",
                                      "open_qa", "synthesis_application",
                                      "personal_connection",
                                      "collaborative_interaction"),
    metric = factor(metric, levels = rev(custom_metric_order))
  )
ggplot(academic_data, aes(x = coef_GPT, y = metric, color = category)) +
  geom_vline(xintercept = 0, linetype = "dashed", alpha = 0.2, linewidth = 0.5) +
  geom_errorbarh(aes(xmin = lower_ci, xmax = upper_ci), height = 0.2, linewidth = 0.09) +
  geom_point(size = 0.06) +
  # FIX: compute hjust inside aes() so each significance star aligns with its
  # own point. Passing academic_data$coef_GPT as a layer parameter breaks
  # under facet_wrap(): the full-length vector is matched against each
  # panel's row subset and silently recycled, misplacing the labels.
  geom_text(aes(label = sigsig, x = coef_GPT,
                hjust = ifelse(coef_GPT > 0, -0.4, 1.4)),
            vjust = 0.5, size = 2) +
  facet_wrap(~ subset_name_ordered, scales = "free_y", ncol = 2) + # Facet by the 5 academic types
  scale_color_manual(values = c("Content Creation" = "#0072B2", "Social Engagement" = "#D55E00")) +
  labs(
    title = "RQ3: Effect of GPT Use on Metrics, by Academic Type of Discussion Prompt",
    x = "Coefficient of GPT_use (Fixed Effects)",
    y = "Metric",
    color = "Metric Category"
  ) +
  theme_bw(base_size = 6) +
  theme(
    legend.position = "bottom",
    strip.background = element_rect(fill = "gray85"),
    plot.title.position = "plot"
  )